# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name：      lagou
   Description :
   Author :         samge
   date：            17-11-10
-------------------------------------------------
   Change Activity: 17-11-10:
-------------------------------------------------
"""
__author__ = 'samge'
from urllib import request
import re,json,time
import xlsxwriter
#mongoDB
from pymongo import mongo_client
# HTTP request headers attached to every request built in getJobList.
# The Referer points at the Lagou "python" job-list page; its city parameter
# is the URL-encoded form of 深圳 (Shenzhen).
# NOTE(review): the Cookie looks like a captured logged-in browser session
# (it contains login=true) — presumably it expires and has to be refreshed by
# hand, and it probably should not live in source control; confirm.
heads = {
    'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Referer':'https://www.lagou.com/jobs/list_python?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=',
    'Cookie':'user_trace_token=20171110131501-c945a177-92a6-44cb-95b6-e09183eb751d; JSESSIONID=ABAAABAACEBACDG53B016A01676FEF2450AA3222BBE10AA; X_HTTP_TOKEN=a7b4220d0286966bd10cdb7d56330ebb; LGUID=20171110131525-28e1e3f7-c5d6-11e7-989f-5254005c3644; _putrc=DED33229FEE22DD2; login=true; unick=%E9%82%B5%E6%88%90%E6%8A%A5; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; TG-TRACK-CODE=index_search; _gid=GA1.2.2133680274.1510290926; _ga=GA1.2.1006573876.1510290926; LGRID=20171110142353-b91e26a0-c5df-11e7-866f-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510290926; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510295033; SEARCH_ID=7537594a6d034debb6f1483c55653db7; index_location_city=%E6%B7%B1%E5%9C%B3'
}

# Create the Excel workbook; this runs at module load time, and the file is
# only finalized when workbook.close() is called in the script body.
workbook = xlsxwriter.Workbook('Python招聘信息.xlsx')
# Add the single worksheet that writeToExcel writes into.
worksheet = workbook.add_worksheet("简单数据获取")

# Fetch the job-posting list one page at a time.
def getJobList(page):
    """Fetch one page of Python job postings from the Lagou position API.

    The paging fields (``first``, ``pn``, ``kd``) are form-encoded and sent
    as the request body, so ``page`` actually selects which page is returned.

    :param page: page number to request; page 1 is flagged as the first page.
    :return: the value found at ``content -> positionResult -> result`` in
        the decoded JSON response (iterated by the caller as a list of
        job dicts).
    :raises KeyError, TypeError: if the JSON response does not have the
        expected ``content/positionResult/result`` shape.
    :raises urllib.error.URLError: if the HTTP request itself fails.
    """
    # Imported locally because the module header only imports urllib.request.
    from urllib import parse

    page_data = {
        # The API distinguishes the first page from follow-up pages.
        'first': 'true' if page == 1 else 'false',
        'pn': page,
        'kd': 'Python',
    }
    url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false&isSchoolJob=0'

    # Supplying ``data`` turns the request into a form-encoded POST; without
    # it the paging fields never reach the server and every call fetches the
    # same default page.
    body = parse.urlencode(page_data).encode('utf-8')
    req = request.Request(url, data=body, headers=heads)

    # Close the HTTP response deterministically instead of leaking it.
    with request.urlopen(req) as resp:
        raw = resp.read()

    # Undecodable bytes are dropped rather than raising, as before.
    payload = json.loads(raw.decode('utf-8', errors='ignore'))
    return payload["content"]["positionResult"]["result"]


# Write one row of data into the worksheet.
def writeToExcel(row = 0,companyFullName='公司名', salary='薪资', positionName='职位名', city='工作地点', positionAdvantage='福利',createTime='时间'):
    """Write one row of six cells into the module-level ``worksheet``.

    Called with only ``row=0`` it writes the header row, because the
    parameter defaults are the column titles. For any other row the caller
    passes the values of one job posting.

    The values written are always the arguments passed in; the function does
    not depend on any variable of the calling scope.

    :param row: zero-based worksheet row index to write to.
    :param companyFullName: value for column 0.
    :param salary: value for column 1.
    :param positionName: value for column 2.
    :param city: value for column 3.
    :param positionAdvantage: value for column 4.
    :param createTime: value for column 5.
    """
    cells = (
        companyFullName,
        salary,
        positionName,
        city,
        positionAdvantage,
        createTime,
    )
    # Column order matches the header order: position in the tuple == column.
    for col, value in enumerate(cells):
        worksheet.write(row, col, value)

if __name__ == "__main__":
    # Script entry point: scrape pages 1-9, store every posting in MongoDB
    # (database ``jobDB``, collection ``col``) and in the Excel worksheet.
    conn = mongo_client.MongoClient("mongodb://localhost:27017/")
    db = conn.jobDB
    try:
        # Row 0 holds the column titles (writeToExcel's defaults).
        writeToExcel(row=0)
        # Data rows start at the second spreadsheet row.
        row = 1
        for page in range(1,10):
            # Fetch each page exactly once and reuse the result for both
            # sinks, so the database and the sheet hold the same postings and
            # the site is not hit twice per page.
            jobs = getJobList(page = page)
            if jobs:
                # insert_many replaces the deprecated Collection.insert
                # (removed in PyMongo 4); it rejects an empty list, hence
                # the guard.
                db.col.insert_many(jobs)
            for bean in jobs:
                writeToExcel(row = row,companyFullName = bean["companyFullName"],salary = bean["salary"],positionName = bean["positionName"],city = bean["city"],positionAdvantage = bean["positionAdvantage"],createTime = bean["createTime"])
                row += 1
            print('第%d页数据已经爬取完成'%page)
            # Pause briefly between pages to go easy on the server.
            time.sleep(0.5)
        # NOTE(review): find() returns a cursor, so this prints the cursor
        # object rather than the stored documents — confirm that is intended.
        print(db.col.find())
    finally:
        try:
            # Always finalize the workbook so rows scraped before a failure
            # are still saved to disk.
            workbook.close()
        finally:
            # Close the MongoDB client itself; a collection has no close().
            conn.close()


